In [ ]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import joblib
import os

import warnings
warnings.filterwarnings('ignore')  
plt.rcParams['font.sans-serif']=['SimHei'] #显示中文  

# 设置Seaborn样式
sns.set(font='Microsoft YaHei')
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs:
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
d:\ProgramData\Anaconda3\envs\pytorch\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)
In [ ]:
# 获取字典保存各个模型的最终结果
result_model = {}

# target = ['TN loss (%)', 'NH3-N loss (%)', 'N2O-N loss (%)']
target = 'NH3-N loss (%)' # 要预测啥就换个名字
# target = ['EF %', 'LN(EF)', 'LNR(N2O)', 'N2O rate(kg N ha-1 y-1)']

output_file = 'output/TN_NH3_N2O'
input_file = 'data/TN_NH3_N2O'
model_save_file = f'output/TN_NH3_N2O/model_{target}'
os.makedirs(output_file, exist_ok=True)
os.makedirs(model_save_file, exist_ok=True)
In [ ]:
data_path = f'{input_file}/data_for_{target}.csv'
data_tree_path = f'{input_file}/data_for_tree_{target}.csv'
model_performance_path = f'{output_file}/Model_{target}.html'
model_performance_json = f'{output_file}/result_model_{target}.json'

数据集划分¶

In [ ]:
data_all_ef = pd.read_csv(data_path)
X_all = data_all_ef.iloc[:, :-1]
y_all = data_all_ef.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_train = y_train.reset_index(drop=True)

## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)

input_cols = X_all.columns.tolist()
In [ ]:
data_all_tree = pd.read_csv(data_tree_path)
X_allt = data_all_tree.iloc[:, :-1]
y_allt = data_all_tree.iloc[:, -1]

X_traint, X_testt, y_traint, y_testt = train_test_split(X_all, y_all, test_size=0.2, random_state=2023)
y_traint = y_traint.reset_index(drop=True)

## 为了正确评估模型性能,将数据划分为训练集和测试集,并在训练集上训练模型,在测试集上验证模型性能。
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)
In [ ]:
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 669 entries, 545 to 537
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   material_0                   669 non-null    int64  
 1   initial CN(%)                669 non-null    float64
 2   initial moisture content(%)  669 non-null    float64
 3   initial pH                   669 non-null    float64
 4   material_1                   669 non-null    int64  
 5   Excipients                   669 non-null    int64  
 6   initial TN(%)                669 non-null    float64
 7   initial TC(%)                669 non-null    float64
 8   Additive Species             669 non-null    int64  
dtypes: float64(5), int64(4)
memory usage: 52.3 KB
In [ ]:
X_train.describe()
Out[ ]:
material_0 initial CN(%) initial moisture content(%) initial pH material_1 Excipients initial TN(%) initial TC(%) Additive Species
count 669.000000 669.000000 669.000000 669.000000 669.000000 669.000000 669.000000 669.000000 669.000000
mean 3.292975 13.719288 50.376727 4.942949 2.887892 55.754858 -0.119390 11.450745 3.608371
std 1.445920 11.782051 29.399692 3.882304 1.292882 26.505163 1.972162 31.433285 0.928099
min 0.000000 -1.000000 -1.000000 -1.000000 0.000000 0.000000 -1.000000 -1.000000 0.000000
25% 3.000000 -1.000000 48.000000 -1.000000 3.000000 38.000000 -1.000000 -1.000000 4.000000
50% 3.000000 15.390000 63.000000 6.900000 3.000000 71.000000 -1.000000 -1.000000 4.000000
75% 5.000000 21.373333 70.000000 7.800000 4.000000 78.000000 -1.000000 -1.000000 4.000000
max 6.000000 53.008273 88.500000 10.320000 4.000000 78.000000 14.200000 197.000000 4.000000
In [ ]:
X_train
Out[ ]:
material_0 initial CN(%) initial moisture content(%) initial pH material_1 Excipients initial TN(%) initial TC(%) Additive Species
545 6 41.00000 56.900000 7.83 4 78 3.9 159.9 4
175 3 -1.00000 -1.000000 6.85 2 78 -1.0 -1.0 3
11 3 -1.00000 -1.000000 -1.00 3 78 -1.0 -1.0 0
121 3 -1.00000 -1.000000 -1.00 0 78 -1.0 -1.0 3
714 5 13.56000 71.050000 8.08 4 45 -1.0 -1.0 4
... ... ... ... ... ... ... ... ... ...
515 5 18.40000 55.300000 6.44 4 49 -1.0 -1.0 4
732 3 6.10000 71.000000 6.70 2 12 -1.0 -1.0 4
695 0 37.83542 65.276961 7.30 4 78 -1.0 -1.0 4
454 3 16.60000 53.800000 7.90 2 13 -1.0 -1.0 4
537 6 -1.00000 70.000000 8.40 4 78 -1.0 -1.0 4

669 rows × 9 columns

In [ ]:
y_train.isnull().any()
Out[ ]:
False
In [ ]:
print(X_train.isnull().any())
material_0                     False
initial CN(%)                  False
initial moisture content(%)    False
initial pH                     False
material_1                     False
Excipients                     False
initial TN(%)                  False
initial TC(%)                  False
Additive Species               False
dtype: bool
In [ ]:
np.isnan(X_train).any()
Out[ ]:
material_0                     False
initial CN(%)                  False
initial moisture content(%)    False
initial pH                     False
material_1                     False
Excipients                     False
initial TN(%)                  False
initial TC(%)                  False
Additive Species               False
dtype: bool

树模型预测(RF,XGB,LGB,CTB)¶

RF¶

In [ ]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=1000, criterion='mse',
                            max_depth=10, min_samples_split=2,
                            min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                            max_features='auto',  max_leaf_nodes=None,
                            bootstrap=True, oob_score=False,
                            n_jobs=1, random_state=None,
                            verbose=0, warm_start=False)

rf_preds = np.zeros(X_train.shape[0])

for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    rf_model.fit(x_tr, y_tr)
    rf_pred_valid = rf_model.predict(x_va)
    rf_preds[valid_index] = rf_pred_valid

    print('The rmse of the RFRegression is:',metrics.mean_squared_error(y_va,rf_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, rf_pred_valid, 'r', label='xgb predict')
    plt.legend(loc='best')
    plt.title('xgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the RFRegression is: 72.33406927962925
(535, 9) (535,)
The rmse of the RFRegression is: 120.24513283097679
(535, 9) (535,)
The rmse of the RFRegression is: 57.01293697537265
(535, 9) (535,)
The rmse of the RFRegression is: 72.7852758294013
(536, 9) (536,)
The rmse of the RFRegression is: 47.14527417335798
In [ ]:
rf_pred_test = rf_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,rf_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, rf_pred_test, 'r', label='rf predict')
plt.legend(loc='best')
plt.title('rf-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(rf_model, f'{model_save_file}/rf_model.pkl') 
result_model['RandomForest(k)'] = metrics.mean_squared_error(y_test,rf_pred_test)
The rmse of the test is: 87.06266377717145

XGB¶

In [ ]:
import xgboost as xgb

xgb_preds = np.zeros(X_train.shape[0])
kf = KFold(n_splits=num_folds, shuffle=True)
xgb_model = xgb.XGBRegressor(max_depth=8,
                        learning_rate=0.1,
                        n_estimators=300,
                        n_jobs=4,
                        colsample_bytree=0.8,
                        subsample=0.8,
                        random_state=32,
                        tree_method='hist'
                        )

for train_index, valid_index in kf.split(X_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    xgb_model.fit(x_tr, y_tr)
    xgb_pred_valid = xgb_model.predict(x_va)
    xgb_preds[valid_index] = xgb_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_va,xgb_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, xgb_pred_valid, 'r', label='xgb predict')
    plt.legend(loc='best')
    plt.title('xgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the XgbRegression is: 73.41349618255467
(535, 9) (535,)
The rmse of the XgbRegression is: 91.57950901758214
(535, 9) (535,)
The rmse of the XgbRegression is: 74.12756581448943
(535, 9) (535,)
The rmse of the XgbRegression is: 87.34311628843
(536, 9) (536,)
The rmse of the XgbRegression is: 79.56812684878001
In [ ]:
xgb_pred_test = xgb_model.predict(X_test)
print('The rmse of the XgbRegression is:',metrics.mean_squared_error(y_test, xgb_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, xgb_pred_test, 'r', label='xgb predict')
plt.legend(loc='best')
plt.title('xgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(xgb_model, f'{model_save_file}/xgb_model.pkl') 
result_model['XGBoost(k)'] = metrics.mean_squared_error(y_test,xgb_pred_test)
The rmse of the XgbRegression is: 94.74935148490458

LGB¶

In [ ]:
import lightgbm as lgb

lgb_preds = np.zeros(X_traint.shape[0])
for i, (train_index, valid_index) in enumerate(kf.split(X_traint, y_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_tr, y_tr, X_va, y_va = X_traint.iloc[train_index], \
        y_traint.iloc[train_index], X_traint.iloc[valid_index], y_traint.iloc[valid_index]

    params_lgb = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'learning_rate': 0.1,
                'n_estimators': 300,
                'metric': 'root_mean_squared_error',
                'min_child_weight': 1e-3,
                'min_child_samples': 10,
                'num_leaves': 31,
                'max_depth': -1,
                'seed': 2023,
                'verbose': -1,
    }
    
    lgb_model = lgb.LGBMRegressor(**params_lgb)
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_va, y_va)], eval_metric=['mae','rmse'])
    lgb_val_pred = lgb_model.predict(X_va)
    lgb_preds[valid_index] = lgb_val_pred

    print('The rmse of the LgbRegression is:',metrics.mean_squared_error(y_va,lgb_val_pred))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lgb_val_pred, 'r', label='lgb predict')
    plt.legend(loc='best')
    plt.title('lgb-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************
The rmse of the LgbRegression is: 63.47356331282958
************************************ 2 ************************************
The rmse of the LgbRegression is: 57.67203190953856
************************************ 3 ************************************
The rmse of the LgbRegression is: 62.06630210427079
************************************ 4 ************************************
The rmse of the LgbRegression is: 115.09563250500037
************************************ 5 ************************************
The rmse of the LgbRegression is: 80.12872820839479
In [ ]:
lgb_pred_test = lgb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,lgb_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x[:100], y_testt[:100], 'g', label='ground-turth')
plt.plot(x[:100], lgb_pred_test[:100], 'r', label='lgb predict')
plt.legend(loc='best')
plt.title('lgb-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(lgb_model, f'{model_save_file}/lgb_model.pkl') 
result_model['Lightgbm(k)'] = metrics.mean_squared_error(y_testt,lgb_pred_test)
The rmse of the test is: 90.01997376916715

CTB¶

In [ ]:
from catboost import CatBoostRegressor

params = {
    'learning_rate': 0.05,
    'loss_function': "RMSE",
    'eval_metric': "RMSE", # CrossEntropy
    'depth': 8,
    'min_data_in_leaf': 20,
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'one_hot_max_size': 5,   #类别数量多于此数将使用ordered target statistics编码方法,默认值为2。
    'boosting_type':"Ordered", #Ordered 或者Plain,数据量较少时建议使用Ordered,训练更慢但能够缓解梯度估计偏差。
    'max_ctr_complexity': 2, #特征组合的最大特征数量,设置为1取消特征组合,设置为2只做两个特征的组合,默认为4。
    'nan_mode': 'Min' 
}
iterations = 400
early_stopping_rounds = 200
ctb_model = CatBoostRegressor(iterations=iterations,
    early_stopping_rounds = early_stopping_rounds,
    **params)

for i, (train_index, valid_index) in enumerate(kf.split(X_traint)):
    print('************************************ {} ************************************'.format(str(i+1)))
    x_tr, x_va = X_traint.iloc[train_index], X_traint.iloc[valid_index]
    y_tr, y_va = y_traint.iloc[train_index], y_traint.iloc[valid_index]
    
    print(x_tr.shape, y_tr.shape)
    ctb_model.fit(x_tr, y_tr, eval_set=(x_va, y_va), verbose=0, early_stopping_rounds=1000)
    ctb_pre= ctb_model.predict(x_va)

    print('The rmse of the CatRegression is:',metrics.mean_squared_error(y_va,ctb_pre))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, ctb_pre, 'r', label='cat predict')
    plt.legend(loc='best')
    plt.title('cat-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
************************************ 1 ************************************
(535, 9) (535,)
The rmse of the CatRegression is: 65.41703351619354
************************************ 2 ************************************
(535, 9) (535,)
The rmse of the CatRegression is: 79.09693900072871
************************************ 3 ************************************
(535, 9) (535,)
The rmse of the CatRegression is: 60.165751627339866
************************************ 4 ************************************
(535, 9) (535,)
The rmse of the CatRegression is: 73.7228345960462
************************************ 5 ************************************
(536, 9) (536,)
The rmse of the CatRegression is: 72.28120593473213
In [ ]:
cat_pred_test = ctb_model.predict(X_testt)
print('The rmse of the test is:',metrics.mean_squared_error(y_testt,cat_pred_test))
x = list(range(0, len(y_testt)))
plt.figure(figsize=(40,10))
plt.plot(x, y_testt, 'g', label='ground-turth')
plt.plot(x, cat_pred_test, 'r', label='cat predict')
plt.legend(loc='best')
plt.title('cat-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(ctb_model, f'{model_save_file}/ctb_model.pkl') 
result_model['Catboost(k)'] = metrics.mean_squared_error(y_test,cat_pred_test)
The rmse of the test is: 99.7637237685675

使用其他机器学习方法预测¶

岭回归¶

In [ ]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

ri_model = Ridge(alpha=0.5,fit_intercept=True)

lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    ri_model.fit(x_tr, y_tr)
    lr_pred_valid = ri_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid

    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lr_pred_valid, 'r', label='lr predict')
    plt.legend(loc='best')
    plt.title('lr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the LR is: 118.5247762940648
(535, 9) (535,)
The rmse of the LR is: 91.32071794094006
(535, 9) (535,)
The rmse of the LR is: 120.4805198773532
(535, 9) (535,)
The rmse of the LR is: 102.9025768099061
(536, 9) (536,)
The rmse of the LR is: 145.73062685873657
In [ ]:
ri_pred_test = ri_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,ri_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, ri_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(ri_model, f'{model_save_file}/ri_model.pkl') 
result_model['RidgeRegression(k)'] = metrics.mean_squared_error(y_test,ri_pred_test)
The rmse of the test is: 153.13336047251556

线性回归¶

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

lr_model = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

lr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    lr_model.fit(x_tr, y_tr)
    lr_pred_valid = lr_model.predict(x_va)
    lr_preds[valid_index] = lr_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,lr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, lr_pred_valid, 'r', label='lr predict')
    plt.legend(loc='best')
    plt.title('lr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the LR is: 113.31311205133704
(535, 9) (535,)
The rmse of the LR is: 93.11116165851249
(535, 9) (535,)
The rmse of the LR is: 105.81811908648206
(535, 9) (535,)
The rmse of the LR is: 167.33796204169673
(536, 9) (536,)
The rmse of the LR is: 103.7560669453761
In [ ]:
lr_pred_test = lr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,lr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, lr_pred_test, 'r', label='lr predict')
plt.legend(loc='best')
plt.title('lr-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(lr_model, f'{model_save_file}/lr_model.pkl')
result_model['LinearRegression(k)'] = metrics.mean_squared_error(y_test,lr_pred_test)
The rmse of the test is: 152.84409567861422

使用MLP进行预测¶

In [ ]:
from sklearn.neural_network import MLPRegressor

mlp_model = MLPRegressor(solver='lbfgs',alpha=1e-5, hidden_layer_sizes=(40,40), max_iter=500, random_state=2023)

mlp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    mlp_model.fit(x_tr, y_tr)
    mlp_pred_valid = mlp_model.predict(x_va)
    mlp_preds[valid_index] = mlp_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the MLP is:',metrics.mean_squared_error(y_va,mlp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, mlp_pred_valid, 'r', label='mlp predict')
    plt.legend(loc='best')
    plt.title('mlp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the MLP is: 64.54947984165855
(535, 9) (535,)
The rmse of the MLP is: 100.48187981995927
(535, 9) (535,)
The rmse of the MLP is: 91.98040411391537
(535, 9) (535,)
The rmse of the MLP is: 146.4392203169691
(536, 9) (536,)
The rmse of the MLP is: 100.49902790636249
In [ ]:
mlp_pred_test = mlp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,mlp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, mlp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(mlp_model, f'{model_save_file}/mlp_model.pkl')
result_model['MLP(k)'] = metrics.mean_squared_error(y_test,mlp_pred_test)
The rmse of the test is: 142.08429708042357

SVR进行预测¶

In [ ]:
from sklearn.svm import SVR

svr_model = SVR(kernel ='rbf',
                degree = 3,
                coef0 = 0.0,
                tol = 0.001,
                C = 1.0,
                epsilon = 0.1,
                shrinking = True,
                cache_size = 200,
                verbose = False,
                max_iter = -1)

svr_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    svr_model.fit(x_tr, y_tr)
    svr_pred_valid = svr_model.predict(x_va)
    svr_preds[valid_index] = svr_pred_valid

    # print('The accuracy of the Logistic Regression is:',metrics.accuracy_score(y_valid,xgb_pred_valid))
    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,svr_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, svr_pred_valid, 'r', label='svr predict')
    plt.legend(loc='best')
    plt.title('svr-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the LR is: 138.4509977311952
(535, 9) (535,)
The rmse of the LR is: 112.88574994132262
(535, 9) (535,)
The rmse of the LR is: 97.6472119945676
(535, 9) (535,)
The rmse of the LR is: 100.71330341737936
(536, 9) (536,)
The rmse of the LR is: 148.6798739815513
In [ ]:
svr_pred_test = svr_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,svr_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, svr_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(svr_model, f'{model_save_file}/svr_model.pkl')
result_model['SVR(k)'] = metrics.mean_squared_error(y_test,svr_pred_test)
The rmse of the test is: 154.50148948624263

使用GAUSS Rgression进行预测¶

In [ ]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# 创建高斯过程回归模型对象
gp_model = GaussianProcessRegressor(kernel=RBF())

gp_preds = np.zeros(X_train.shape[0])
for train_index, valid_index in kf.split(X_train, y_train):
    x_tr, x_va = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_va = y_train.iloc[train_index], y_train.iloc[valid_index]

    print(x_tr.shape, y_tr.shape)
    gp_model.fit(x_tr, y_tr)
    gp_pred_valid = gp_model.predict(x_va)
    gp_preds[valid_index] = gp_pred_valid

    print('The rmse of the LR is:',metrics.mean_squared_error(y_va,gp_pred_valid))
    x = list(range(0, len(y_va)))
    plt.figure(figsize=(40,10))
    plt.plot(x, y_va, 'g', label='ground-turth')
    plt.plot(x, gp_pred_valid, 'r', label='gp predict')
    plt.legend(loc='best')
    plt.title('gp-yield')
    plt.xlabel('sample')
    plt.ylabel('Yield_lnRR')
    plt.show()
(535, 9) (535,)
The rmse of the LR is: 178.81596682846134
(535, 9) (535,)
The rmse of the LR is: 242.78209000036935
(535, 9) (535,)
The rmse of the LR is: 231.17116404877655
(535, 9) (535,)
The rmse of the LR is: 240.36255291792932
(536, 9) (536,)
The rmse of the LR is: 168.86409456256115
In [ ]:
gp_pred_test = gp_model.predict(X_test)
print('The rmse of the test is:',metrics.mean_squared_error(y_test,gp_pred_test))
x = list(range(0, len(y_test)))
plt.figure(figsize=(40,10))
plt.plot(x, y_test, 'g', label='ground-turth')
plt.plot(x, gp_pred_test, 'r', label='mlp predict')
plt.legend(loc='best')
plt.title('mlp-yield')
plt.xlabel('sample')
plt.ylabel('Yield_lnRR')
plt.show()

joblib.dump(gp_model, f'{model_save_file}/gp_model.pkl')
result_model['GR(k)'] = metrics.mean_squared_error(y_test,gp_pred_test)
The rmse of the test is: 259.07925065377

不同模型的效果比对分析(RMSE指标)¶

In [ ]:
import json

# 将字典保存为 JSON 文件
with open(model_performance_json, 'w') as json_file:
    json.dump(result_model, json_file)
In [ ]:
import plotly.express as px
import plotly.io as pio

with open(model_performance_json, 'r') as json_file:
    result_model = json.load(json_file)
    
result_model = dict(sorted(result_model.items(), key=lambda item: item[1]))
categories = list(result_model.keys())
values = list(result_model.values())

color_mapping = {}
for k, v in result_model.items():
    if v < 0.4:
        color_mapping[k] = "Tree-Base"
    else:
        color_mapping[k] = "Other"
    
# 创建柱状图
fig = px.bar(x=categories, y=values, title='Models Performance in ln(EF)', color=color_mapping)
fig.update_layout(template="seaborn")
# 显示图表
fig.show()
# 保存柱状图为 HTML 文件
pio.write_html(fig, file=model_performance_path)  

挑选合适的模型进行进一步的变量分析(Tree-Base)¶

In [ ]:
import shap
shap.initjs()

变量重要性分析¶

RF模型分析--变量重要性分析¶
In [ ]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
colors=px.colors.qualitative.Plotly

feat_importance=pd.DataFrame()
feat_importance["Importance"]=rf_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Xgboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=xgb_model.feature_importances_
feat_importance.set_index(X_train.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Catboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()
Lightbgm¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()

SHAP是由Shapley value启发的可加性解释模型。对于每个预测样本,模型都产生一个预测值,SHAP value就是该样本中每个特征所分配到的数值。 很明显可以看出,与上一节中feature importance相比,SHAP value最大的优势是SHAP能对于反映出每一个样本中的特征的影响力,而且还表现出影响的正负性。

In [ ]:
# 获取feature importance
plt.figure(figsize=(15, 5))

feat_importance=pd.DataFrame()
feat_importance["Importance"]=lgb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance', ascending=False)

plt.bar(range(len(X_traint.columns)), feat_importance['Importance'])
plt.xticks(range(len(X_traint.columns)), feat_importance.index, rotation=90, fontsize=14)
plt.title('Feature importance', fontsize=14)
plt.show()
In [ ]:
lgb_explainer = shap.TreeExplainer(lgb_model)
lgb_shap_values = lgb_explainer.shap_values(X_traint)
print(lgb_shap_values.shape)
(669, 9)
In [ ]:
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values[0,:], X_traint.iloc[0,:])
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

对第一个实例的特征贡献图也可用 waterfall 方式展示

In [ ]:
shap.plots._waterfall.waterfall_legacy(
    lgb_explainer.expected_value, 
    lgb_shap_values[0], 
    feature_names=X_traint.columns)

上图的解释显示了每个有助于将模型输出从基值(我们传递的训练数据集上的平均模型输出)贡献到模型输出值的特征。将预测推高的特征以红色显示,将预测推低的特征以蓝色显示。

如果我们采取许多实例来聚合显示解释,如下图所示,将它们旋转 90 度,然后将它们水平堆叠,我们可以看到整个数据集的解释(在 Notebook 中,此图是交互式的)

In [ ]:
# visualize the training set predictions
shap.force_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint)
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

下图中每一行代表一个特征,横坐标为SHAP值。一个点代表一个样本,颜色越红说明特征本身数值越大,颜色越蓝说明特征本身数值越小。

In [ ]:
shap.summary_plot(lgb_shap_values, X_traint)

shap value值排序的特征重要性

In [ ]:
shap.summary_plot(lgb_shap_values, X_traint, plot_type="bar")

SHAP也提供了部分依赖图的功能,与传统的部分依赖图不同的是,这里纵坐标不是目标变量y的数值而是SHAP值。可以观察各个特征的分布与目标shap值的关系。

In [ ]:
fig, axes = plt.subplots(len(input_cols)//3+1, 3, figsize=(20,30))
for i, col in enumerate(input_cols):
    shap.dependence_plot(col, lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[i//3,i%3])
    # shap.dependence_plot('Sand (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[0,2])
    # shap.dependence_plot('Silt (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,0])
    # shap.dependence_plot('Clay (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,1])
    # shap.dependence_plot('SOC (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[1,2])
    # shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,1])
    # shap.dependence_plot('C/N', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[2,2])
    # shap.dependence_plot('pH', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,0])
    # shap.dependence_plot('TN (%)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,1])
    # shap.dependence_plot('BD', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[3,2])
    # shap.dependence_plot('CEC', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,0])
    # shap.dependence_plot('N application', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('BNE', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('MAT (°C)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
    # shap.dependence_plot('MAP (mm)', lgb_shap_values, X_traint, interaction_index=None, show=False, ax=axes[4,1])
  • 对多个变量的交互进行分析

我们也可以多个变量的交互作用进行分析。一种方式是采用summary_plot描绘出散点图,如下:

In [ ]:
shap_interaction_values = shap.TreeExplainer(lgb_model).shap_interaction_values(X_traint)
plt.figure(figsize=(12,12))
shap.summary_plot(shap_interaction_values, X_traint, max_display=6)
plt.show()
<Figure size 1200x1200 with 0 Axes>

我们也可以用dependence_plot描绘两个变量交互下变量对目标值的影响。

In [ ]:
shap.dependence_plot(input_cols[0], lgb_shap_values, X_traint, interaction_index=input_cols[1], show=False)

也能可视化每种特征对于整体预测值的影响。

In [ ]:
shap.decision_plot(lgb_explainer.expected_value, lgb_shap_values, X_traint, ignore_warnings=True)
Catboost¶
In [ ]:
feat_importance=pd.DataFrame()
feat_importance["Importance"]=ctb_model.feature_importances_
feat_importance.set_index(X_traint.columns, inplace=True)

feat_importance = feat_importance.sort_values(by='Importance',ascending=True)
pal=sns.color_palette("plasma_r", 70).as_hex()[2:]

fig=go.Figure()
for i in range(len(feat_importance.index)):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=feat_importance['Importance'][i], 
                       line_color=pal[::-1][i],opacity=0.7,line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['Importance'], y=feat_importance.index, mode='markers', 
                         marker_color=pal[::-1], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.5f}<extra></extra>'))
fig.update_layout(template=temp,title='Overall Feature Importance', 
                  xaxis=dict(title='Average Importance',zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120,t=80),
                  height=700, width=800)
fig.show()